This project aims to develop a predictive model to identify the likelihood of bankruptcy for Taiwanese companies.
In a work setting and by leveraging machine learning techniques and financial data, the goal would be to assist stakeholders, such as investors, creditors, and regulatory bodies in making informed decisions.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
cd '/content/drive/MyDrive/Python for data analysis/Project/taiwanese+bankruptcy+prediction'
/content/drive/MyDrive/Python for data analysis/Project/taiwanese+bankruptcy+prediction
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
import os
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_classif
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, make_scorer, auc, precision_recall_curve
from scipy.stats import norm
import joblib
import numpy as np
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
df = pd.read_csv('data.csv')
df
| Bankrupt? | ROA(C) before interest and depreciation before interest | ROA(A) before interest and % after tax | ROA(B) before interest and depreciation after tax | Operating Gross Margin | Realized Sales Gross Margin | Operating Profit Rate | Pre-tax net Interest Rate | After-tax net Interest Rate | Non-industry income and expenditure/revenue | ... | Net Income to Total Assets | Total assets to GNP price | No-credit Interval | Gross Profit to Sales | Net Income to Stockholder's Equity | Liability to Equity | Degree of Financial Leverage (DFL) | Interest Coverage Ratio (Interest expense to EBIT) | Net Income Flag | Equity to Liability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.370594 | 0.424389 | 0.405750 | 0.601457 | 0.601457 | 0.998969 | 0.796887 | 0.808809 | 0.302646 | ... | 0.716845 | 0.009219 | 0.622879 | 0.601453 | 0.827890 | 0.290202 | 0.026601 | 0.564050 | 1 | 0.016469 |
| 1 | 1 | 0.464291 | 0.538214 | 0.516730 | 0.610235 | 0.610235 | 0.998946 | 0.797380 | 0.809301 | 0.303556 | ... | 0.795297 | 0.008323 | 0.623652 | 0.610237 | 0.839969 | 0.283846 | 0.264577 | 0.570175 | 1 | 0.020794 |
| 2 | 1 | 0.426071 | 0.499019 | 0.472295 | 0.601450 | 0.601364 | 0.998857 | 0.796403 | 0.808388 | 0.302035 | ... | 0.774670 | 0.040003 | 0.623841 | 0.601449 | 0.836774 | 0.290189 | 0.026555 | 0.563706 | 1 | 0.016474 |
| 3 | 1 | 0.399844 | 0.451265 | 0.457733 | 0.583541 | 0.583541 | 0.998700 | 0.796967 | 0.808966 | 0.303350 | ... | 0.739555 | 0.003252 | 0.622929 | 0.583538 | 0.834697 | 0.281721 | 0.026697 | 0.564663 | 1 | 0.023982 |
| 4 | 1 | 0.465022 | 0.538432 | 0.522298 | 0.598783 | 0.598783 | 0.998973 | 0.797366 | 0.809304 | 0.303475 | ... | 0.795016 | 0.003878 | 0.623521 | 0.598782 | 0.839973 | 0.278514 | 0.024752 | 0.575617 | 1 | 0.035490 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6814 | 0 | 0.493687 | 0.539468 | 0.543230 | 0.604455 | 0.604462 | 0.998992 | 0.797409 | 0.809331 | 0.303510 | ... | 0.799927 | 0.000466 | 0.623620 | 0.604455 | 0.840359 | 0.279606 | 0.027064 | 0.566193 | 1 | 0.029890 |
| 6815 | 0 | 0.475162 | 0.538269 | 0.524172 | 0.598308 | 0.598308 | 0.998992 | 0.797414 | 0.809327 | 0.303520 | ... | 0.799748 | 0.001959 | 0.623931 | 0.598306 | 0.840306 | 0.278132 | 0.027009 | 0.566018 | 1 | 0.038284 |
| 6816 | 0 | 0.472725 | 0.533744 | 0.520638 | 0.610444 | 0.610213 | 0.998984 | 0.797401 | 0.809317 | 0.303512 | ... | 0.797778 | 0.002840 | 0.624156 | 0.610441 | 0.840138 | 0.275789 | 0.026791 | 0.565158 | 1 | 0.097649 |
| 6817 | 0 | 0.506264 | 0.559911 | 0.554045 | 0.607850 | 0.607850 | 0.999074 | 0.797500 | 0.809399 | 0.303498 | ... | 0.811808 | 0.002837 | 0.623957 | 0.607846 | 0.841084 | 0.277547 | 0.026822 | 0.565302 | 1 | 0.044009 |
| 6818 | 0 | 0.493053 | 0.570105 | 0.549548 | 0.627409 | 0.627409 | 0.998080 | 0.801987 | 0.813800 | 0.313415 | ... | 0.815956 | 0.000707 | 0.626680 | 0.627408 | 0.841019 | 0.275114 | 0.026793 | 0.565167 | 1 | 0.233902 |
6819 rows × 96 columns
This is a binary classification problem, where the goal is to predict whether a company will go bankrupt or not based on various financial features. There is a total of 6819 samples in the dataset with 96 features.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6819 entries, 0 to 6818 Data columns (total 96 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Bankrupt? 6819 non-null int64 1 ROA(C) before interest and depreciation before interest 6819 non-null float64 2 ROA(A) before interest and % after tax 6819 non-null float64 3 ROA(B) before interest and depreciation after tax 6819 non-null float64 4 Operating Gross Margin 6819 non-null float64 5 Realized Sales Gross Margin 6819 non-null float64 6 Operating Profit Rate 6819 non-null float64 7 Pre-tax net Interest Rate 6819 non-null float64 8 After-tax net Interest Rate 6819 non-null float64 9 Non-industry income and expenditure/revenue 6819 non-null float64 10 Continuous interest rate (after tax) 6819 non-null float64 11 Operating Expense Rate 6819 non-null float64 12 Research and development expense rate 6819 non-null float64 13 Cash flow rate 6819 non-null float64 14 Interest-bearing debt interest rate 6819 non-null float64 15 Tax rate (A) 6819 non-null float64 16 Net Value Per Share (B) 6819 non-null float64 17 Net Value Per Share (A) 6819 non-null float64 18 Net Value Per Share (C) 6819 non-null float64 19 Persistent EPS in the Last Four Seasons 6819 non-null float64 20 Cash Flow Per Share 6819 non-null float64 21 Revenue Per Share (Yuan ¥) 6819 non-null float64 22 Operating Profit Per Share (Yuan ¥) 6819 non-null float64 23 Per Share Net profit before tax (Yuan ¥) 6819 non-null float64 24 Realized Sales Gross Profit Growth Rate 6819 non-null float64 25 Operating Profit Growth Rate 6819 non-null float64 26 After-tax Net Profit Growth Rate 6819 non-null float64 27 Regular Net Profit Growth Rate 6819 non-null float64 28 Continuous Net Profit Growth Rate 6819 non-null float64 29 Total Asset Growth Rate 6819 non-null float64 30 Net Value Growth Rate 6819 non-null float64 31 Total Asset Return Growth Rate Ratio 6819 non-null float64 32 Cash Reinvestment % 6819 non-null float64 33 Current Ratio 6819 non-null float64 34 Quick Ratio 6819 non-null float64 35 Interest Expense Ratio 6819 non-null float64 36 Total debt/Total net worth 6819 non-null float64 37 Debt ratio % 6819 non-null float64 38 Net worth/Assets 6819 non-null float64 39 Long-term fund suitability ratio (A) 6819 non-null float64 40 Borrowing dependency 6819 non-null float64 41 Contingent liabilities/Net worth 6819 non-null float64 42 Operating profit/Paid-in capital 6819 non-null float64 43 Net profit before tax/Paid-in capital 6819 non-null float64 44 Inventory and accounts receivable/Net value 6819 non-null float64 45 Total Asset Turnover 6819 non-null float64 46 Accounts Receivable Turnover 6819 non-null float64 47 Average Collection Days 6819 non-null float64 48 Inventory Turnover Rate (times) 6819 non-null float64 49 Fixed Assets Turnover Frequency 6819 non-null float64 50 Net Worth Turnover Rate (times) 6819 non-null float64 51 Revenue per person 6819 non-null float64 52 Operating profit per person 6819 non-null float64 53 Allocation rate per person 6819 non-null float64 54 Working Capital to Total Assets 6819 non-null float64 55 Quick Assets/Total Assets 6819 non-null float64 56 Current Assets/Total Assets 6819 non-null float64 57 Cash/Total Assets 6819 non-null float64 58 Quick Assets/Current Liability 6819 non-null float64 59 Cash/Current Liability 6819 non-null float64 60 Current Liability to Assets 6819 non-null float64 61 Operating Funds to Liability 6819 non-null float64 62 Inventory/Working Capital 6819 non-null float64 63 Inventory/Current Liability 6819 non-null float64 64 Current Liabilities/Liability 6819 non-null float64 65 Working Capital/Equity 6819 non-null float64 66 Current Liabilities/Equity 6819 non-null float64 67 Long-term Liability to Current Assets 6819 non-null float64 68 Retained Earnings to Total Assets 6819 non-null float64 69 Total income/Total expense 6819 non-null float64 70 Total expense/Assets 6819 non-null float64 71 Current Asset Turnover Rate 6819 non-null float64 72 Quick Asset Turnover Rate 6819 non-null float64 73 Working capitcal Turnover Rate 6819 non-null float64 74 Cash Turnover Rate 6819 non-null float64 75 Cash Flow to Sales 6819 non-null float64 76 Fixed Assets to Assets 6819 non-null float64 77 Current Liability to Liability 6819 non-null float64 78 Current Liability to Equity 6819 non-null float64 79 Equity to Long-term Liability 6819 non-null float64 80 Cash Flow to Total Assets 6819 non-null float64 81 Cash Flow to Liability 6819 non-null float64 82 CFO to Assets 6819 non-null float64 83 Cash Flow to Equity 6819 non-null float64 84 Current Liability to Current Assets 6819 non-null float64 85 Liability-Assets Flag 6819 non-null int64 86 Net Income to Total Assets 6819 non-null float64 87 Total assets to GNP price 6819 non-null float64 88 No-credit Interval 6819 non-null float64 89 Gross Profit to Sales 6819 non-null float64 90 Net Income to Stockholder's Equity 6819 non-null float64 91 Liability to Equity 6819 non-null float64 92 Degree of Financial Leverage (DFL) 6819 non-null float64 93 Interest Coverage Ratio (Interest expense to EBIT) 6819 non-null float64 94 Net Income Flag 6819 non-null int64 95 Equity to Liability 6819 non-null float64 dtypes: float64(93), int64(3) memory usage: 5.0 MB
pd.set_option('display.max_rows', None)
print(df.isnull().sum())
Bankrupt? 0 ROA(C) before interest and depreciation before interest 0 ROA(A) before interest and % after tax 0 ROA(B) before interest and depreciation after tax 0 Operating Gross Margin 0 Realized Sales Gross Margin 0 Operating Profit Rate 0 Pre-tax net Interest Rate 0 After-tax net Interest Rate 0 Non-industry income and expenditure/revenue 0 Continuous interest rate (after tax) 0 Operating Expense Rate 0 Research and development expense rate 0 Cash flow rate 0 Interest-bearing debt interest rate 0 Tax rate (A) 0 Net Value Per Share (B) 0 Net Value Per Share (A) 0 Net Value Per Share (C) 0 Persistent EPS in the Last Four Seasons 0 Cash Flow Per Share 0 Revenue Per Share (Yuan ¥) 0 Operating Profit Per Share (Yuan ¥) 0 Per Share Net profit before tax (Yuan ¥) 0 Realized Sales Gross Profit Growth Rate 0 Operating Profit Growth Rate 0 After-tax Net Profit Growth Rate 0 Regular Net Profit Growth Rate 0 Continuous Net Profit Growth Rate 0 Total Asset Growth Rate 0 Net Value Growth Rate 0 Total Asset Return Growth Rate Ratio 0 Cash Reinvestment % 0 Current Ratio 0 Quick Ratio 0 Interest Expense Ratio 0 Total debt/Total net worth 0 Debt ratio % 0 Net worth/Assets 0 Long-term fund suitability ratio (A) 0 Borrowing dependency 0 Contingent liabilities/Net worth 0 Operating profit/Paid-in capital 0 Net profit before tax/Paid-in capital 0 Inventory and accounts receivable/Net value 0 Total Asset Turnover 0 Accounts Receivable Turnover 0 Average Collection Days 0 Inventory Turnover Rate (times) 0 Fixed Assets Turnover Frequency 0 Net Worth Turnover Rate (times) 0 Revenue per person 0 Operating profit per person 0 Allocation rate per person 0 Working Capital to Total Assets 0 Quick Assets/Total Assets 0 Current Assets/Total Assets 0 Cash/Total Assets 0 Quick Assets/Current Liability 0 Cash/Current Liability 0 Current Liability to Assets 0 Operating Funds to Liability 0 Inventory/Working Capital 0 Inventory/Current Liability 0 Current Liabilities/Liability 0 Working Capital/Equity 0 Current Liabilities/Equity 0 Long-term Liability to Current Assets 0 Retained Earnings to Total Assets 0 Total income/Total expense 0 Total expense/Assets 0 Current Asset Turnover Rate 0 Quick Asset Turnover Rate 0 Working capitcal Turnover Rate 0 Cash Turnover Rate 0 Cash Flow to Sales 0 Fixed Assets to Assets 0 Current Liability to Liability 0 Current Liability to Equity 0 Equity to Long-term Liability 0 Cash Flow to Total Assets 0 Cash Flow to Liability 0 CFO to Assets 0 Cash Flow to Equity 0 Current Liability to Current Assets 0 Liability-Assets Flag 0 Net Income to Total Assets 0 Total assets to GNP price 0 No-credit Interval 0 Gross Profit to Sales 0 Net Income to Stockholder's Equity 0 Liability to Equity 0 Degree of Financial Leverage (DFL) 0 Interest Coverage Ratio (Interest expense to EBIT) 0 Net Income Flag 0 Equity to Liability 0 dtype: int64
df_zero_count = (df == 0).sum()
pd.set_option('display.max_rows', None)
print(df_zero_count)
Bankrupt? 6599 ROA(C) before interest and depreciation before interest 1 ROA(A) before interest and % after tax 1 ROA(B) before interest and depreciation after tax 1 Operating Gross Margin 1 Realized Sales Gross Margin 1 Operating Profit Rate 1 Pre-tax net Interest Rate 1 After-tax net Interest Rate 1 Non-industry income and expenditure/revenue 1 Continuous interest rate (after tax) 1 Operating Expense Rate 1 Research and development expense rate 1424 Cash flow rate 1 Interest-bearing debt interest rate 891 Tax rate (A) 2568 Net Value Per Share (B) 1 Net Value Per Share (A) 1 Net Value Per Share (C) 1 Persistent EPS in the Last Four Seasons 1 Cash Flow Per Share 1 Revenue Per Share (Yuan ¥) 2 Operating Profit Per Share (Yuan ¥) 1 Per Share Net profit before tax (Yuan ¥) 1 Realized Sales Gross Profit Growth Rate 1 Operating Profit Growth Rate 1 After-tax Net Profit Growth Rate 1 Regular Net Profit Growth Rate 1 Continuous Net Profit Growth Rate 1 Total Asset Growth Rate 1 Net Value Growth Rate 1 Total Asset Return Growth Rate Ratio 1 Cash Reinvestment % 1 Current Ratio 1 Quick Ratio 1 Interest Expense Ratio 1 Total debt/Total net worth 1 Debt ratio % 1 Net worth/Assets 1 Long-term fund suitability ratio (A) 1 Borrowing dependency 1 Contingent liabilities/Net worth 1 Operating profit/Paid-in capital 1 Net profit before tax/Paid-in capital 1 Inventory and accounts receivable/Net value 1 Total Asset Turnover 8 Accounts Receivable Turnover 7 Average Collection Days 7 Inventory Turnover Rate (times) 1 Fixed Assets Turnover Frequency 8 Net Worth Turnover Rate (times) 1 Revenue per person 2 Operating profit per person 1 Allocation rate per person 6 Working Capital to Total Assets 1 Quick Assets/Total Assets 1 Current Assets/Total Assets 1 Cash/Total Assets 1 Quick Assets/Current Liability 1 Cash/Current Liability 1 Current Liability to Assets 1 Operating Funds to Liability 1 Inventory/Working Capital 1 Inventory/Current Liability 227 Current Liabilities/Liability 1 Working Capital/Equity 1 Current Liabilities/Equity 1 Long-term Liability to Current Assets 2569 Retained Earnings to Total Assets 1 Total income/Total expense 1 Total expense/Assets 1 Current Asset Turnover Rate 1 Quick Asset Turnover Rate 1 Working capitcal Turnover Rate 1 Cash Turnover Rate 1 Cash Flow to Sales 1 Fixed Assets to Assets 6 Current Liability to Liability 1 Current Liability to Equity 1 Equity to Long-term Liability 1 Cash Flow to Total Assets 1 Cash Flow to Liability 1 CFO to Assets 1 Cash Flow to Equity 1 Current Liability to Current Assets 1 Liability-Assets Flag 6811 Net Income to Total Assets 1 Total assets to GNP price 1 No-credit Interval 1 Gross Profit to Sales 1 Net Income to Stockholder's Equity 1 Liability to Equity 1 Degree of Financial Leverage (DFL) 1 Interest Coverage Ratio (Interest expense to EBIT) 1 Net Income Flag 0 Equity to Liability 1 dtype: int64
For some of the features here, there is a lot of 0 values. Some of them are genuine measurements like the target class "bankrupt?" or the "Liability-Assets Flag". However, some of them represent missing data, like the "Tax Rate (A) or "Long-term Liability to Current Assets". We're going to clean it by imputing the mean value.
df.columns
Index(['Bankrupt?', ' ROA(C) before interest and depreciation before interest',
' ROA(A) before interest and % after tax',
' ROA(B) before interest and depreciation after tax',
' Operating Gross Margin', ' Realized Sales Gross Margin',
' Operating Profit Rate', ' Pre-tax net Interest Rate',
' After-tax net Interest Rate',
' Non-industry income and expenditure/revenue',
' Continuous interest rate (after tax)', ' Operating Expense Rate',
' Research and development expense rate', ' Cash flow rate',
' Interest-bearing debt interest rate', ' Tax rate (A)',
' Net Value Per Share (B)', ' Net Value Per Share (A)',
' Net Value Per Share (C)', ' Persistent EPS in the Last Four Seasons',
' Cash Flow Per Share', ' Revenue Per Share (Yuan ¥)',
' Operating Profit Per Share (Yuan ¥)',
' Per Share Net profit before tax (Yuan ¥)',
' Realized Sales Gross Profit Growth Rate',
' Operating Profit Growth Rate', ' After-tax Net Profit Growth Rate',
' Regular Net Profit Growth Rate', ' Continuous Net Profit Growth Rate',
' Total Asset Growth Rate', ' Net Value Growth Rate',
' Total Asset Return Growth Rate Ratio', ' Cash Reinvestment %',
' Current Ratio', ' Quick Ratio', ' Interest Expense Ratio',
' Total debt/Total net worth', ' Debt ratio %', ' Net worth/Assets',
' Long-term fund suitability ratio (A)', ' Borrowing dependency',
' Contingent liabilities/Net worth',
' Operating profit/Paid-in capital',
' Net profit before tax/Paid-in capital',
' Inventory and accounts receivable/Net value', ' Total Asset Turnover',
' Accounts Receivable Turnover', ' Average Collection Days',
' Inventory Turnover Rate (times)', ' Fixed Assets Turnover Frequency',
' Net Worth Turnover Rate (times)', ' Revenue per person',
' Operating profit per person', ' Allocation rate per person',
' Working Capital to Total Assets', ' Quick Assets/Total Assets',
' Current Assets/Total Assets', ' Cash/Total Assets',
' Quick Assets/Current Liability', ' Cash/Current Liability',
' Current Liability to Assets', ' Operating Funds to Liability',
' Inventory/Working Capital', ' Inventory/Current Liability',
' Current Liabilities/Liability', ' Working Capital/Equity',
' Current Liabilities/Equity', ' Long-term Liability to Current Assets',
' Retained Earnings to Total Assets', ' Total income/Total expense',
' Total expense/Assets', ' Current Asset Turnover Rate',
' Quick Asset Turnover Rate', ' Working capitcal Turnover Rate',
' Cash Turnover Rate', ' Cash Flow to Sales', ' Fixed Assets to Assets',
' Current Liability to Liability', ' Current Liability to Equity',
' Equity to Long-term Liability', ' Cash Flow to Total Assets',
' Cash Flow to Liability', ' CFO to Assets', ' Cash Flow to Equity',
' Current Liability to Current Assets', ' Liability-Assets Flag',
' Net Income to Total Assets', ' Total assets to GNP price',
' No-credit Interval', ' Gross Profit to Sales',
' Net Income to Stockholder's Equity', ' Liability to Equity',
' Degree of Financial Leverage (DFL)',
' Interest Coverage Ratio (Interest expense to EBIT)',
' Net Income Flag', ' Equity to Liability'],
dtype='object')
columns_to_clean = [" Tax rate (A)", " Long-term Liability to Current Assets", " Inventory/Current Liability", " Interest-bearing debt interest rate"]
imputation_value = df[columns_to_clean].mean()
# Replace zero values with the mean value in the specified columns
df[columns_to_clean] = df[columns_to_clean].replace(0, imputation_value)
df.describe()
| Bankrupt? | ROA(C) before interest and depreciation before interest | ROA(A) before interest and % after tax | ROA(B) before interest and depreciation after tax | Operating Gross Margin | Realized Sales Gross Margin | Operating Profit Rate | Pre-tax net Interest Rate | After-tax net Interest Rate | Non-industry income and expenditure/revenue | ... | Net Income to Total Assets | Total assets to GNP price | No-credit Interval | Gross Profit to Sales | Net Income to Stockholder's Equity | Liability to Equity | Degree of Financial Leverage (DFL) | Interest Coverage Ratio (Interest expense to EBIT) | Net Income Flag | Equity to Liability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | ... | 6819.000000 | 6.819000e+03 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.000000 | 6819.0 | 6819.000000 |
| mean | 0.032263 | 0.505180 | 0.558625 | 0.553589 | 0.607948 | 0.607929 | 0.998755 | 0.797190 | 0.809084 | 0.303623 | ... | 0.807760 | 1.862942e+07 | 0.623915 | 0.607946 | 0.840402 | 0.280365 | 0.027541 | 0.565358 | 1.0 | 0.047578 |
| std | 0.176710 | 0.060686 | 0.065620 | 0.061595 | 0.016934 | 0.016916 | 0.013010 | 0.012869 | 0.013601 | 0.011163 | ... | 0.040332 | 3.764501e+08 | 0.012290 | 0.016934 | 0.014523 | 0.014463 | 0.015668 | 0.013214 | 0.0 | 0.050014 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 | 0.000000 |
| 25% | 0.000000 | 0.476527 | 0.535543 | 0.527277 | 0.600445 | 0.600434 | 0.998969 | 0.797386 | 0.809312 | 0.303466 | ... | 0.796750 | 9.036205e-04 | 0.623636 | 0.600443 | 0.840115 | 0.276944 | 0.026791 | 0.565158 | 1.0 | 0.024477 |
| 50% | 0.000000 | 0.502706 | 0.559802 | 0.552278 | 0.605997 | 0.605976 | 0.999022 | 0.797464 | 0.809375 | 0.303525 | ... | 0.810619 | 2.085213e-03 | 0.623879 | 0.605998 | 0.841179 | 0.278778 | 0.026808 | 0.565252 | 1.0 | 0.033798 |
| 75% | 0.000000 | 0.535563 | 0.589157 | 0.584105 | 0.613914 | 0.613842 | 0.999095 | 0.797579 | 0.809469 | 0.303585 | ... | 0.826455 | 5.269777e-03 | 0.624168 | 0.613913 | 0.842357 | 0.281449 | 0.026913 | 0.565725 | 1.0 | 0.052838 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 9.820000e+09 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 |
8 rows × 96 columns
plt.style.use('seaborn')
bar_df = df['Bankrupt?'].value_counts().reset_index()
plt.title('Company Bankruptcy Distributions', fontsize = 17)
plt.bar(bar_df['index'], bar_df['Bankrupt?'],
color=['Green', 'red'], tick_label=['Normal', 'Bankrupt'], width = 0.5)
plt.xlabel('Bankruptcy status')
plt.show()
a,b = len(df[df['Bankrupt?']==0]), len(df[df['Bankrupt?']==1])
len(df[df['Bankrupt?']==0])
print("Normal: ", a, "(", a/len(df)*100,'% of the dataset)', "Bankrupt: ", b, '(', b/len(df)*100,'% of the dataset)')
<ipython-input-3-749f2dd813f6>:1: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
plt.style.use('seaborn')
Normal: 6599 ( 96.77372048687491 % of the dataset) Bankrupt: 220 ( 3.2262795131250916 % of the dataset)
Our dataset is highly unbalanced which will affect our model performance, to solve this problem we're going to use some data preprocessing methods (Oversampling and SMOTE) and see which one can give us the best model performance.
corr = df.corr()
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
img = ax.imshow(corr, cmap='magma', interpolation='nearest', aspect='auto')
ax.set_xticks(np.arange(len(corr.columns)), labels=list(corr.columns))
ax.set_yticks(np.arange(len(corr.columns)), labels=list(corr.columns))
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.colorbar(img)
plt.show()
There are a lot of multicollinearity in the dataset which means that a regression model will not work properly on this dataset
Some features have a very high correlation between each other (for instance the 3 ROA and the 3 Net Value per share), which can cause issues in certain statistical models so we're going to remove them.
# Set a correlation threshold
correlation_threshold = 0.9
# Identify highly correlated features
highly_correlated_features = set()
for i in range(len(corr.columns)):
for j in range(i):
if abs(corr.iloc[i, j]) > correlation_threshold:
colname = corr.columns[i]
highly_correlated_features.add(colname)
# Drop highly correlated features
print("Original DataFrame shape:", df.shape)
df = df.drop(columns=highly_correlated_features)
# Display the filtered DataFrame
print("Filtered DataFrame shape:", df.shape)
Original DataFrame shape: (6819, 96) Filtered DataFrame shape: (6819, 77)
This removed 19 features from the dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6819 entries, 0 to 6818 Data columns (total 71 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Bankrupt? 6819 non-null int64 1 ROA(C) before interest and depreciation before interest 6819 non-null float64 2 Operating Gross Margin 6819 non-null float64 3 Operating Profit Rate 6819 non-null float64 4 Non-industry income and expenditure/revenue 6819 non-null float64 5 Operating Expense Rate 6819 non-null float64 6 Research and development expense rate 6819 non-null float64 7 Cash flow rate 6819 non-null float64 8 Interest-bearing debt interest rate 6819 non-null float64 9 Tax rate (A) 6819 non-null float64 10 Net Value Per Share (B) 6819 non-null float64 11 Persistent EPS in the Last Four Seasons 6819 non-null float64 12 Cash Flow Per Share 6819 non-null float64 13 Revenue Per Share (Yuan ¥) 6819 non-null float64 14 Realized Sales Gross Profit Growth Rate 6819 non-null float64 15 Operating Profit Growth Rate 6819 non-null float64 16 After-tax Net Profit Growth Rate 6819 non-null float64 17 Continuous Net Profit Growth Rate 6819 non-null float64 18 Total Asset Growth Rate 6819 non-null float64 19 Net Value Growth Rate 6819 non-null float64 20 Total Asset Return Growth Rate Ratio 6819 non-null float64 21 Cash Reinvestment % 6819 non-null float64 22 Current Ratio 6819 non-null float64 23 Quick Ratio 6819 non-null float64 24 Interest Expense Ratio 6819 non-null float64 25 Total debt/Total net worth 6819 non-null float64 26 Debt ratio % 6819 non-null float64 27 Long-term fund suitability ratio (A) 6819 non-null float64 28 Borrowing dependency 6819 non-null float64 29 Contingent liabilities/Net worth 6819 non-null float64 30 Inventory and accounts receivable/Net value 6819 non-null float64 31 Total Asset Turnover 6819 non-null float64 32 Accounts Receivable Turnover 6819 non-null float64 33 Average Collection Days 6819 non-null float64 34 Inventory Turnover Rate (times) 6819 non-null float64 35 Fixed Assets Turnover Frequency 6819 non-null float64 36 Net Worth Turnover Rate (times) 6819 non-null float64 37 Revenue per person 6819 non-null float64 38 Operating profit per person 6819 non-null float64 39 Allocation rate per person 6819 non-null float64 40 Working Capital to Total Assets 6819 non-null float64 41 Quick Assets/Total Assets 6819 non-null float64 42 Current Assets/Total Assets 6819 non-null float64 43 Cash/Total Assets 6819 non-null float64 44 Quick Assets/Current Liability 6819 non-null float64 45 Cash/Current Liability 6819 non-null float64 46 Inventory/Working Capital 6819 non-null float64 47 Inventory/Current Liability 6819 non-null float64 48 Current Liabilities/Liability 6819 non-null float64 49 Working Capital/Equity 6819 non-null float64 50 Long-term Liability to Current Assets 6819 non-null float64 51 Retained Earnings to Total Assets 6819 non-null float64 52 Total income/Total expense 6819 non-null float64 53 Total expense/Assets 6819 non-null float64 54 Current Asset Turnover Rate 6819 non-null float64 55 Quick Asset Turnover Rate 6819 non-null float64 56 Working capitcal Turnover Rate 6819 non-null float64 57 Cash Turnover Rate 6819 non-null float64 58 Fixed Assets to Assets 6819 non-null float64 59 Cash Flow to Total Assets 6819 non-null float64 60 Cash Flow to Liability 6819 non-null float64 61 CFO to Assets 6819 non-null float64 62 Cash Flow to Equity 6819 non-null float64 63 Current Liability to Current Assets 6819 non-null float64 64 Liability-Assets Flag 6819 non-null int64 65 Total assets to GNP price 6819 non-null float64 66 No-credit Interval 6819 non-null float64 67 Degree of Financial Leverage (DFL) 6819 non-null float64 68 Interest Coverage Ratio (Interest expense to EBIT) 6819 non-null float64 69 Net Income Flag 6819 non-null int64 70 Equity to Liability 6819 non-null float64 dtypes: float64(68), int64(3) memory usage: 3.7 MB
X = df.drop('Bankrupt?', axis = 1)
Y = df['Bankrupt?']
scaler = StandardScaler()
X1= scaler.fit_transform(X)
model = RandomForestClassifier(n_estimators=150, max_depth=None, min_samples_split=3, criterion='entropy',random_state=42)
model.fit(X1, Y)
global_importances = pd.Series(model.feature_importances_, index=X.columns)
global_importances.sort_values(ascending=True, inplace=True)
plt.figure(figsize=(10, 10))
# Plot horizontal bar chart
global_importances.plot.barh(color='green')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Global Feature Importance - Random Forest")
plt.show()
model = MLPClassifier(activation = 'relu', solver = 'adam', alpha = 0.0001, max_iter=1000, hidden_layer_sizes=(10,), learning_rate='adaptive', early_stopping=True, random_state = 42)
model.fit(X, Y)
# Get weights of the input layer
input_layer_weights = model.coefs_[0]
# Sum the weights for each feature to get feature importance
feature_importances = np.sum(np.abs(input_layer_weights), axis=1)
global_importances = pd.Series(feature_importances, index=X.columns)
global_importances.sort_values(ascending=True, inplace=True)
plt.figure(figsize=(10, 10))
# Plot horizontal bar chart
global_importances.plot.barh(color='green')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Global Feature Importance - MLP Classifier")
plt.show()
model = GradientBoostingClassifier(n_estimators = 100, loss = 'log_loss', learning_rate = 0.1, max_depth = 3, min_samples_split = 3, random_state = 42)
model.fit(X, Y)
global_importances = pd.Series(model.feature_importances_, index=X.columns)
global_importances.sort_values(ascending=True, inplace=True)
plt.figure(figsize=(10, 10))
# Plot horizontal bar chart
global_importances.plot.barh(color='green')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Global Feature Importance - Gradient Boosting")
Text(0.5, 1.0, 'Global Feature Importance - Gradient Boosting')
It's hard to tell exactly which features are actually important for the different models we want to build and test. Moreover, we have high dimensional data so we're going to use PCA to reduce the number of features while retaining as much of the original variability as possible to avoid the curse of dimensionality and improve the computational cost. We made the arbitrary choice to retain 90% of the data to strike a balance between reducing dimensionality and retaining most of the variability in the original dataset but it can be adjusted.
n_components = 46
pca = PCA(n_components = n_components)
X = df.drop('Bankrupt?', axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
principal_components = pca.fit_transform(X_scaled)
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratios:", explained_variance_ratio)
Explained Variance Ratios: [0.13302639 0.07201457 0.05035846 0.04722618 0.04234799 0.03221728 0.03049543 0.02934015 0.028018 0.02138866 0.02061347 0.01985918 0.01790796 0.01600078 0.01578127 0.01532797 0.01518607 0.01397292 0.01315409 0.01274948 0.01248888 0.01214463 0.01165158 0.01129505 0.01115503 0.01098243 0.01080837 0.01071019 0.01070481 0.0105216 0.01040354 0.01022688 0.01015939 0.01012103 0.00994858 0.00979118 0.00977354 0.00964706 0.00950135 0.00911215 0.00905145 0.00883177 0.00863573 0.00844508 0.00803731 0.00766375]
sum(explained_variance_ratio)
0.9087986564506394
exp_var_pca = pca.explained_variance_ratio_
# Cumulative sum of eigenvalues; This will be used to create step plot
# for visualizing the variance explained by each principal component.
cum_sum_eigenvalues = np.cumsum(exp_var_pca)
# Create the visualization plot
plt.bar(range(0,len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(0,len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid',label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
X = df.drop('Bankrupt?', axis=1)
y = df['Bankrupt?']
model = GradientBoostingClassifier()
#model = RandomForestClassifier()
#model = MLPClassifier()
# Initialize RFE
rfe = RFE(model, n_features_to_select=46)
X_rfe = rfe.fit_transform(X, y)
# Get selected features
selected_features = X.columns[rfe.support_]
df_rfe = df[selected_features]
df_rfe.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6819 entries, 0 to 6818 Data columns (total 46 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ROA(C) before interest and depreciation before interest 6819 non-null float64 1 Non-industry income and expenditure/revenue 6819 non-null float64 2 Operating Expense Rate 6819 non-null float64 3 Research and development expense rate 6819 non-null float64 4 Cash flow rate 6819 non-null float64 5 Interest-bearing debt interest rate 6819 non-null float64 6 Net Value Per Share (B) 6819 non-null float64 7 Persistent EPS in the Last Four Seasons 6819 non-null float64 8 Operating Profit Per Share (Yuan ¥) 6819 non-null float64 9 Realized Sales Gross Profit Growth Rate 6819 non-null float64 10 Operating Profit Growth Rate 6819 non-null float64 11 Continuous Net Profit Growth Rate 6819 non-null float64 12 Total Asset Growth Rate 6819 non-null float64 13 Net Value Growth Rate 6819 non-null float64 14 Total Asset Return Growth Rate Ratio 6819 non-null float64 15 Current Ratio 6819 non-null float64 16 Quick Ratio 6819 non-null float64 17 Interest Expense Ratio 6819 non-null float64 18 Debt ratio % 6819 non-null float64 19 Borrowing dependency 6819 non-null float64 20 Accounts Receivable Turnover 6819 non-null float64 21 Inventory Turnover Rate (times) 6819 non-null float64 22 Operating profit per person 6819 non-null float64 23 Allocation rate per person 6819 non-null float64 24 Working Capital to Total Assets 6819 non-null float64 25 Quick Assets/Total Assets 6819 non-null float64 26 Cash/Total Assets 6819 non-null float64 27 Quick Assets/Current Liability 6819 non-null float64 28 Cash/Current Liability 6819 non-null float64 29 Inventory/Working Capital 6819 non-null float64 30 Current Liabilities/Liability 6819 non-null float64 31 Working Capital/Equity 6819 non-null float64 32 Long-term Liability to Current Assets 6819 non-null float64 33 Retained Earnings to Total Assets 6819 non-null float64 34 Total income/Total expense 6819 non-null float64 35 Total expense/Assets 6819 non-null float64 36 Current Asset Turnover Rate 6819 non-null float64 37 Working capitcal Turnover Rate 6819 non-null float64 38 Cash Turnover Rate 6819 non-null float64 39 Equity to Long-term Liability 6819 non-null float64 40 Cash Flow to Liability 6819 non-null float64 41 Total assets to GNP price 6819 non-null float64 42 Net Income to Stockholder's Equity 6819 non-null float64 43 Degree of Financial Leverage (DFL) 6819 non-null float64 44 Interest Coverage Ratio (Interest expense to EBIT) 6819 non-null float64 45 Equity to Liability 6819 non-null float64 dtypes: float64(46) memory usage: 2.4 MB
We initially tried to use RFE to help us identify and select the most important features in the dataset by recursively removing the least important features. However, knowing that it assumes a linear relationship between features and the target variable. If the relationship is nonlinear, RFE may not effectively capture the importance of nonlinear features so we compared the performance of the models using RFE / PCA.
Let's start by plotting some features that might interest us
f, axes = plt.subplots(ncols=4, figsize=(24,6))
sns.boxplot(x='Bankrupt?', y=" Net Income to Total Assets", data=df, ax=axes[0])
axes[0].set_title('Bankrupt vs Net Income to Total Assets')
sns.boxplot(x='Bankrupt?', y=" Total debt/Total net worth", data=df, ax=axes[1])
axes[1].set_title('Bankrupt vs Tot Debt/Net worth Correlation')
sns.boxplot(x='Bankrupt?', y=" Debt ratio %", data=df, ax=axes[2])
axes[2].set_title('Bankrupt vs Debt ratio Correlation')
sns.boxplot(x='Bankrupt?', y=" Net worth/Assets", data=df, ax=axes[3])
axes[3].set_title('Bankrupt vs Net Worth/Assets Correlation')
plt.show()
f, (ax1, ax2, ax3, ax4) = plt.subplots(1,4, figsize=(24, 6))
cash_flow_rate = df[' Net Income to Total Assets'].loc[df['Bankrupt?'] == 1].values
sns.distplot(cash_flow_rate,ax=ax1, fit=norm, color='#FB8861')
ax1.set_title(' Net Income to Total Assets \n (Unstable companies)', fontsize=14)
tot_debt_net = df[' Total debt/Total net worth'].loc[df['Bankrupt?'] == 1].values
sns.distplot(tot_debt_net ,ax=ax2, fit=norm, color='#56F9BB')
ax2.set_title('total debt/tot net worth \n (Unstable companies)', fontsize=14)
debt_ratio = df[' Debt ratio %'].loc[df['Bankrupt?'] == 1].values
sns.distplot(debt_ratio,ax=ax3, fit=norm, color='#C5B3F9')
ax3.set_title('debt_ratio \n (Unstable companies)', fontsize=14)
net_worth_assets = df[' Net worth/Assets'].loc[df['Bankrupt?'] == 1].values
sns.distplot(net_worth_assets,ax=ax4, fit=norm, color='#C5B3F9')
ax4.set_title('net worth/assets \n (Unstable companies)', fontsize=14)
plt.show()
<ipython-input-70-541a739c6c12>:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(cash_flow_rate,ax=ax1, fit=norm, color='#FB8861') <ipython-input-70-541a739c6c12>:8: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(tot_debt_net ,ax=ax2, fit=norm, color='#56F9BB') <ipython-input-70-541a739c6c12>:13: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(debt_ratio,ax=ax3, fit=norm, color='#C5B3F9') <ipython-input-70-541a739c6c12>:17: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(net_worth_assets,ax=ax4, fit=norm, color='#C5B3F9')
df_bankrupt = df[df['Bankrupt?'] == 1]
df_not_bankrupt = df[df['Bankrupt?'] == 0]
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_bankrupt.index , y = ' ROA(C) before interest and % after tax', hue = 'Bankrupt?', data = df_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_not_bankrupt.index , y = ' ROA(C) before interest and % after tax', hue = 'Bankrupt?', data = df_not_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_bankrupt.index , y = ' Cash flow rate', hue='Bankrupt?', data=df_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_not_bankrupt.index , y = ' Cash flow rate', hue='Bankrupt?', data=df_not_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_bankrupt.index , y=' Debt ratio %', hue = 'Bankrupt?', data=df_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_not_bankrupt.index , y=' Debt ratio %', hue = 'Bankrupt?', data=df_not_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_bankrupt.index , y=' Net Income to Total Assets', hue = 'Bankrupt?', data=df_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df_not_bankrupt.index , y=' Net Income to Total Assets', hue = 'Bankrupt?', data=df_not_bankrupt)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df.index , y = ' ROA(C) before interest and % after tax', hue = 'Bankrupt?', data = df)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df.index , y = ' Cash flow rate', hue='Bankrupt?', data=df)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df.index , y=' Debt ratio %', hue = 'Bankrupt?', data=df)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df.index , y=' Net Income to Total Assets', hue = 'Bankrupt?', data=df)
plt.show()
fig = plt.figure(figsize = (20,20))
sns.scatterplot(x = df.index , y=' Current Liability to Assets', hue = 'Bankrupt?', data=df)
plt.show()
These scatter point plots show that there are multiple outliers but because the data is highly imbalanced, we cannot afford to drop data. Moreover, we can see that lower values of "Net income to Total assets", "ROA (A) before interest and % after tax" and high values of "Debt Ratio %" have a tendency to lead to bankruptcy.
features_to_visualize = [' ROA(C) before interest and depreciation before interest', ' Debt ratio %']
for feature in features_to_visualize:
plt.figure(figsize=(8, 6))
sns.histplot(df[feature], bins=30, kde=True)
plt.title(f'Distribution of {feature}')
plt.xlabel(feature)
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(8, 6))
sns.histplot(df[df['Bankrupt?'] == 1][' Debt ratio %'], bins=30, kde=True)
plt.title('Distribution of Debt ratio% for Bankrupt Instances')
plt.xlabel('Debt ratio %')
plt.ylabel('Frequency')
plt.show()
features_to_visualize = [' ROA(C) before interest and depreciation before interest', ' Debt ratio %']
for feature in features_to_visualize:
plt.figure(figsize=(8, 6))
sns.histplot(df[feature], bins=30, kde=True)
plt.title(f'Distribution of {feature}')
plt.xlabel(feature)
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(8, 6))
sns.histplot(df[df['Bankrupt?'] == 1][' Debt ratio %'], bins=30, kde=True)
plt.title('Distribution of Debt ratio% for Bankrupt Instances')
plt.xlabel('Debt ratio %')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize = (20,20))
ax =sns.boxplot(data = df, orient="h")
ax.set_title('Data Boxplots', fontsize = 18)
ax.set(xscale="log")
plt.show()
df.hist(figsize = (35,30), bins = 50)
plt.show()
Now we're going to solve the class imbalance problem and test different models and try to find the best hyperparameters using GridSearch and cross-validation
X = df.drop("Bankrupt?", axis=1)
X_rfe = df_rfe
Y = df['Bankrupt?']
scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
#X_pca = principal_components
#X_pca_scaled = scaler.fit_transform(X_pca)
X_rfe_scaled = scaler.fit_transform(X_rfe)
smote_sampler = SMOTE()
ros = RandomOverSampler(random_state=0)
# Splitting into training and testing data without oversampling, we're choosing a test size of 50% to "undersample" the training data a bit but also to test on more bankrupt data samples
#x_train, x_test, y_train, y_test = train_test_split(X_scaled , Y, test_size=0.3, random_state=2) # Data without feature engineering / selection
x_train, x_test, y_train, y_test = train_test_split(X_rfe_scaled, Y, test_size=0.5, random_state=2) # RFE
#x_train, x_test, y_train, y_test = train_test_split(X_pca_scaled, Y, test_size=0.5, random_state=2) # PCA
# Oversampling only the training data
x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
#x_train, y_train = ros.fit_resample(x_train, y_train)
bar_df = y_train.value_counts().reset_index()
plt.title('Class Distribution', fontsize=20)
plt.bar(bar_df['index'],
bar_df['Bankrupt?'],
color=['red', 'blue'],
tick_label=['Bankrupt', 'normal']
)
plt.xlabel('Transaction type')
plt.show()
print('Training Data Shape : ', x_train.shape)
print('Training Labels Shape : ', y_train.shape)
print('Testing Data Shape : ', x_test.shape)
print('Testing Labels Shape : ', y_test.shape)
Training Data Shape : (6606, 46) Training Labels Shape : (6606,) Testing Data Shape : (3410, 46) Testing Labels Shape : (3410,)
For this kind of problem we want to be able to measure the performance of our model accurately so we're going to use various metrics including the f1-score, recall, precision and AUC ROC.
We're also going to plot the confusion matrix to see how many true positive/negative and false positive/negative our models predict.
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
#print(cm)
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j],
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Gradient Boosting Classifier
param_grid = {
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 4, 5],
'min_samples_split': [2, 3, 4],
}
# Create a Gradient Boosting Classifier
base_classifier = GradientBoostingClassifier(loss='log_loss', random_state=42)
# Define the scoring metric (ROC AUC in this case)
scorer = make_scorer(roc_auc_score, greater_is_better=True)
grid_search = GridSearchCV(estimator=base_classifier, param_grid=param_grid, scoring=scorer, cv=3, n_jobs=-1)
# Perform the grid search on the training data
grid_search.fit(x_train, y_train)
# Get the best model and parameters from the grid search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)
# Train the best model on the whole training dataset
best_model.fit(x_train, y_train)
# Evaluate the best model on the test set
pred_test = best_model.predict(x_test)
roc = roc_auc_score(y_test, pred_test)
best_cm = confusion_matrix(y_test, pred_test)
proba_test = best_model.predict_proba(x_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, proba_test)
pr_auc = auc(recall, precision)
# Plot the confusion matrix for the best model
plt.figure(figsize=(8, 4), dpi=100)
sns.set(rc={'axes.grid': False})
plot_confusion_matrix(best_cm, classes=['Normal', 'Bankrupt'])
# Plot performance scores
print('Test Set f1 score : ', f1_score(y_test, pred_test))
print('Test set precision : ', precision_score(y_test, pred_test))
print('Test set recall : ', recall_score(y_test, pred_test))
print("PR AUC:", pr_auc)
print('ROC : ', roc)
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}
Confusion matrix, without normalization
Test Set f1 score : 0.18000000000000002
Test set precision : 0.21428571428571427
Test set recall : 0.15517241379310345
PR AUC: 0.22877144932042118
ROC : 0.5692864081037952
Random Forest Classifier
param_grid = {
'n_estimators': [50, 100],
'max_depth': [None, 10, 25],
'min_samples_split': [10, 15 ,20],
'min_samples_leaf': [1, 2, 4],
}
base_classifier = RandomForestClassifier(random_state=42)
scorer = make_scorer(roc_auc_score, greater_is_better=True)
grid_search = GridSearchCV(estimator=base_classifier, param_grid=param_grid, scoring=scorer, cv=3, n_jobs=-1)
# Perform the grid search on the training data
grid_search.fit(x_train, y_train)
# Get the best model and parameters from the grid search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)
# Train the best model on the whole training dataset
best_model.fit(x_train, y_train)
# Evaluate the best model on the test set
pred_test = best_model.predict(x_test)
roc = roc_auc_score(y_test, pred_test)
best_cm = confusion_matrix(y_test, pred_test)
proba_test = best_model.predict_proba(x_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, proba_test)
pr_auc = auc(recall, precision)
# Plot the confusion matrix for the best model
plt.figure(figsize=(8, 4), dpi=100)
sns.set(rc={'axes.grid': False})
plot_confusion_matrix(best_cm, classes=['Normal', 'Bankrupt'])
# Plot performance scores
print('Test Set f1 score : ', f1_score(y_test, pred_test))
print('Test set precision : ', precision_score(y_test, pred_test))
print('Test set recall : ', recall_score(y_test, pred_test))
print("PR AUC:", pr_auc)
print('ROC : ', roc)
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Confusion matrix, without normalization
Test Set f1 score : 0.14285714285714288
Test set precision : 0.4166666666666667
Test set recall : 0.08620689655172414
PR AUC: 0.28869781972732184
ROC : 0.5413428848955805
Multi-layer Perceptron Classifier
nn_classifier = MLPClassifier(activation = 'relu', solver = 'adam', early_stopping=True, random_state=42)
param_grid = {
'hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,)],
'alpha': [0.0001, 0.001, 0.01, 0.05],
'learning_rate': ['constant', 'adaptive'],
}
scorer = make_scorer(roc_auc_score)
# Perform GridSearchCV
grid_search = GridSearchCV(nn_classifier, param_grid, scoring=scorer, cv=3, n_jobs=-1)
grid_search.fit(x_train, y_train)
# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)
# Train the best model on the whole training dataset
best_model.fit(x_train, y_train)
# Evaluate the best model on the test set
pred_test = best_model.predict(x_test)
roc = roc_auc_score(y_test, pred_test)
best_cm = confusion_matrix(y_test, pred_test)
proba_test = best_model.predict_proba(x_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, proba_test)
pr_auc = auc(recall, precision)
# Plot the confusion matrix for the best model
plt.figure(figsize=(8, 4), dpi=100)
sns.set(rc={'axes.grid': False})
plot_confusion_matrix(best_cm, classes=['Normal', 'Bankrupt'])
# Plot performance scores
print('Test Set f1 score : ', f1_score(y_test, pred_test))
print('Test set precision : ', precision_score(y_test, pred_test))
print('Test set recall : ', recall_score(y_test, pred_test))
print("PR AUC:", pr_auc)
print('ROC : ', roc)
Best Parameters: {'alpha': 0.01, 'hidden_layer_sizes': (40,), 'learning_rate': 'constant'}
Confusion matrix, without normalization
Test Set f1 score : 0.3440233236151603
Test set precision : 0.2576419213973799
Test set recall : 0.5175438596491229
PR AUC: 0.2654883975982102
ROC : 0.7329830948731051
# Save the model to a file
joblib.dump(best_model, 'MLPClassifier.pkl')
# Load the model that we just saved
lr = joblib.load('MLPClassifier.pkl')
# Saving the data columns from training
model_columns = list(X_rfe.columns)
joblib.dump(model_columns, 'model_columns.pkl')
['model_columns.pkl']
Gradient Boosting
Random Forest
MLP Classifier
Gradient Boosting
Random Forest
MLP Classifier
Gradient Boosting
Random Forest
MLP Classifier
Gradient Boosting
Random Forest
MLP Classifier
Gradient Boosting
Random Forest
MLP Classifier